CASE_3

Liam Phan, Michael Bigler, Tania Loureiro, William Elkiess, Dakota Cuellar and Ilyana El Mendili

2023-05-05

Packages

library(DT)
library(adabag)
FALSE Loading required package: rpart
FALSE Loading required package: caret
FALSE Loading required package: ggplot2
FALSE Loading required package: lattice
FALSE Loading required package: foreach
FALSE Loading required package: doParallel
FALSE Loading required package: iterators
FALSE Loading required package: parallel
library(rpart.plot)
library(pROC)
FALSE Type 'citation("pROC")' for a citation.
FALSE 
FALSE Attaching package: 'pROC'
FALSE The following objects are masked from 'package:stats':
FALSE 
FALSE     cov, smooth, var
library(summarytools)
library(corrplot)
FALSE corrplot 0.92 loaded
library(dplyr)
FALSE 
FALSE Attaching package: 'dplyr'
FALSE The following objects are masked from 'package:stats':
FALSE 
FALSE     filter, lag
FALSE The following objects are masked from 'package:base':
FALSE 
FALSE     intersect, setdiff, setequal, union
library(GGally)
FALSE Registered S3 method overwritten by 'GGally':
FALSE   method from   
FALSE   +.gg   ggplot2
library(fastDummies)
library(ggcorrplot)
library(klaR)
FALSE Loading required package: MASS
FALSE 
FALSE Attaching package: 'MASS'
FALSE The following object is masked from 'package:dplyr':
FALSE 
FALSE     select
library(psych)
FALSE 
FALSE Attaching package: 'psych'
FALSE The following objects are masked from 'package:ggplot2':
FALSE 
FALSE     %+%, alpha
library(MASS)
library(devtools)
FALSE Loading required package: usethis
library(ggplot2)
library(ggthemes)
library(GGally)
library(caret)
library(splitTools)
library(rpart)
library(xgboost)
FALSE 
FALSE Attaching package: 'xgboost'
FALSE The following object is masked from 'package:dplyr':
FALSE 
FALSE     slice
library(caTools)
library(dplyr)
library(caret)
library(naniar)
library(kableExtra)
FALSE 
FALSE Attaching package: 'kableExtra'
FALSE The following object is masked from 'package:dplyr':
FALSE 
FALSE     group_rows
#' Plot a 2x2 confusion matrix together with its headline statistics.
#'
#' Draws two stacked panels on the active graphics device: the four cell
#' counts of `cm$table`, then the entries at positions 1, 2, 5, 6 and 7 of
#' `cm$byClass` and the first two entries of `cm$overall`, each printed under
#' its own name. On exit the previous margins are restored and the device
#' layout is reset to a single panel, so later plots are not affected.
#'
#' @param cm A list-like object with a four-cell `table` and named numeric
#'   vectors `byClass` and `overall` (presumably the result of
#'   `caret::confusionMatrix()` for a binary outcome -- TODO confirm).
#' @return `cm`, invisibly; the function is called for its plot.
CM_Function <- function(cm) {
  # as.numeric() flattens the table column by column; the cell positions used
  # below assume predictions in rows and actual classes in columns.
  counts <- as.numeric(cm$table)
  if (length(counts) != 4L) {
    stop("CM_Function() needs a 2x2 confusion table in `cm$table`.",
         call. = FALSE)
  }

  # Upper panel gets two thirds of the device height, lower panel one third.
  layout(matrix(c(1, 1, 2)))
  old_par <- par(mar = c(2, 2, 2, 2))
  on.exit(
    {
      par(old_par)
      layout(1)
    },
    add = TRUE
  )

  plot(c(100, 345), c(300, 450), type = "n", xlab = "", ylab = "",
       xaxt = "n", yaxt = "n")
  title("CONFUSION MATRIX", cex.main = 2)

  # create the matrix
  rect(150, 430, 240, 370, col = "#2F4F4E")
  rect(250, 430, 340, 370, col = "#0D8387")
  rect(150, 305, 240, 365, col = "#0D8387")
  rect(250, 305, 340, 365, col = "#2F4F4E")
  text(c(195, 295), 435, c("No", "Yes"), cex = 1.2)
  text(140, c(400, 335), c("No", "Yes"), cex = 1.2, srt = 90)
  text(125, 370, "Predicted", cex = 1.3, srt = 90, font = 2)
  text(245, 450, "Actual", cex = 1.3, font = 2)

  # add in the cm results (left column top/bottom, then right column)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts,
       cex = 1.6, font = 2, col = "white")

  # add in the specifics
  plot(c(100, 0), c(100, 0), type = "n", xlab = "", ylab = "",
       main = "DETAILS", xaxt = "n", yaxt = "n")
  stat_idx <- c(1, 2, 5, 6, 7)
  stat_x <- c(10, 30, 50, 70, 90)
  text(stat_x, 85, names(cm$byClass)[stat_idx], cex = 1.2, font = 2)
  text(stat_x, 70, round(as.numeric(cm$byClass[stat_idx]), 3), cex = 1.2)

  # add in the accuracy information
  overall_x <- c(30, 70)
  text(overall_x, 35, names(cm$overall)[1:2], cex = 1.5, font = 2)
  text(overall_x, 20, round(as.numeric(cm$overall[1:2]), 3), cex = 1.4)

  invisible(cm)
}

DATA AND QUICK FACTORING

# Load the churn workbook from the working directory.
df <- readxl::read_xls("Cchurn.xls")

# Recode the three no/yes columns as factors labelled "0" (no) and "1" (yes).
df[c("international_plan", "voice_mail_plan", "churn")] <- lapply(
  df[c("international_plan", "voice_mail_plan", "churn")],
  factor,
  levels = c("no", "yes"),
  labels = c("0", "1")
)

SUMMARY

# Build a per-column summary of df and print it with the 'render' method;
# the rendered table follows below.
print(summarytools::dfSummary(df), method = 'render')

Data Frame Summary

df

Dimensions: 5000 x 18
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 account_length [numeric]
Mean (sd) : 100.3 (39.7)
min ≤ med ≤ max:
1 ≤ 100 ≤ 243
IQR (CV) : 54 (0.4)
218 distinct values 5000 (100.0%) 0 (0.0%)
2 international_plan [factor]
1. 0
2. 1
4527(90.5%)
473(9.5%)
5000 (100.0%) 0 (0.0%)
3 voice_mail_plan [factor]
1. 0
2. 1
3677(73.5%)
1323(26.5%)
5000 (100.0%) 0 (0.0%)
4 number_vmail_messages [numeric]
Mean (sd) : 7.8 (13.5)
min ≤ med ≤ max:
0 ≤ 0 ≤ 52
IQR (CV) : 17 (1.7)
48 distinct values 5000 (100.0%) 0 (0.0%)
5 total_day_minutes [numeric]
Mean (sd) : 180.3 (53.9)
min ≤ med ≤ max:
0 ≤ 180.1 ≤ 351.5
IQR (CV) : 72.5 (0.3)
1961 distinct values 5000 (100.0%) 0 (0.0%)
6 total_day_calls [numeric]
Mean (sd) : 100 (19.8)
min ≤ med ≤ max:
0 ≤ 100 ≤ 165
IQR (CV) : 26 (0.2)
123 distinct values 5000 (100.0%) 0 (0.0%)
7 total_day_charge [numeric]
Mean (sd) : 30.6 (9.2)
min ≤ med ≤ max:
0 ≤ 30.6 ≤ 59.8
IQR (CV) : 12.3 (0.3)
1961 distinct values 5000 (100.0%) 0 (0.0%)
8 total_eve_minutes [numeric]
Mean (sd) : 200.6 (50.6)
min ≤ med ≤ max:
0 ≤ 201 ≤ 363.7
IQR (CV) : 67.7 (0.3)
1879 distinct values 5000 (100.0%) 0 (0.0%)
9 total_eve_calls [numeric]
Mean (sd) : 100.2 (19.8)
min ≤ med ≤ max:
0 ≤ 100 ≤ 170
IQR (CV) : 27 (0.2)
126 distinct values 5000 (100.0%) 0 (0.0%)
10 total_eve_charge [numeric]
Mean (sd) : 17.1 (4.3)
min ≤ med ≤ max:
0 ≤ 17.1 ≤ 30.9
IQR (CV) : 5.8 (0.3)
1659 distinct values 5000 (100.0%) 0 (0.0%)
11 total_night_minutes [numeric]
Mean (sd) : 200.4 (50.5)
min ≤ med ≤ max:
0 ≤ 200.4 ≤ 395
IQR (CV) : 67.8 (0.3)
1853 distinct values 5000 (100.0%) 0 (0.0%)
12 total_night_calls [numeric]
Mean (sd) : 99.9 (20)
min ≤ med ≤ max:
0 ≤ 100 ≤ 175
IQR (CV) : 26 (0.2)
131 distinct values 5000 (100.0%) 0 (0.0%)
13 total_night_charge [numeric]
Mean (sd) : 9 (2.3)
min ≤ med ≤ max:
0 ≤ 9 ≤ 17.8
IQR (CV) : 3.1 (0.3)
1028 distinct values 5000 (100.0%) 0 (0.0%)
14 total_intl_minutes [numeric]
Mean (sd) : 10.3 (2.8)
min ≤ med ≤ max:
0 ≤ 10.3 ≤ 20
IQR (CV) : 3.5 (0.3)
170 distinct values 5000 (100.0%) 0 (0.0%)
15 total_intl_calls [numeric]
Mean (sd) : 4.4 (2.5)
min ≤ med ≤ max:
0 ≤ 4 ≤ 20
IQR (CV) : 3 (0.6)
21 distinct values 5000 (100.0%) 0 (0.0%)
16 total_intl_charge [numeric]
Mean (sd) : 2.8 (0.7)
min ≤ med ≤ max:
0 ≤ 2.8 ≤ 5.4
IQR (CV) : 0.9 (0.3)
170 distinct values 5000 (100.0%) 0 (0.0%)
17 number_customer_service_calls [numeric]
Mean (sd) : 1.6 (1.3)
min ≤ med ≤ max:
0 ≤ 1 ≤ 9
IQR (CV) : 1 (0.8)
0:1023(20.5%)
1:1786(35.7%)
2:1127(22.5%)
3:665(13.3%)
4:252(5.0%)
5:96(1.9%)
6:34(0.7%)
7:13(0.3%)
8:2(0.0%)
9:2(0.0%)
5000 (100.0%) 0 (0.0%)
18 churn [factor]
1. 0
2. 1
4293(85.9%)
707(14.1%)
5000 (100.0%) 0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.3)
2023-05-05

  • We have no missing values -> perfect
  • Heavily uneven counts of the dependent variable (86 % no / 14 % yes) -> maybe resample to balance the classes / maybe not, because we would lose information from the other data
  • Independent variables are on different scales -> standardize
  • two (maybe three) categorical predictors: International plan / voice_mail_plan (/ maybe number_customer_service_calls) -> dummy encode -> not necessary as already 0 and 1
  • The rest of the data is numeric and most of the variables look normally distributed, with the exception of number_vmail_messages and total_intl_calls
    • transform these value to make them normal?
    • maybe make parts of them categorical? (receiving voice mail or not, calling internationally or not)
    • or maybe the categorical values that we have already give an indication for this
    • Test normality of variables
  • Can variables be combined? We have day / eve / night / intl calls and for each of them minutes / calls / charge. Maybe we can combine this into one metric. Maybe average cost per minute or average cost per call?

CORRELATION PLOT BEFORE DATA ENGINEERING

# Keep only the numeric columns for the correlation analysis.
df_numeric <- df[vapply(df, is.numeric, logical(1))]

# Pairwise correlations and the matching matrix of p-values.
M <- cor(df_numeric)
p.mat <- cor_pmat(df_numeric)

# Lower-triangle correlation heat map with clustered ordering and printed
# coefficients; p.mat and sig.level are handed to ggcorrplot() for its
# significance marking.
ggcorrplot(
  M,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  p.mat = p.mat,
  sig.level = 0.05,
  lab_size = 2,
  tl.cex = 10,
  outline.col = "white",
  ggtheme = ggplot2::theme_minimal(),
  colors = c("#2F4F4E", "white", "#0D8387")
)

This confirms the idea from before -> charge and minutes can be combined into one metric: charge / minutes

DATA ENGINEERING

# Charge per minute for each call category. Rows with zero minutes get a rate
# of 0 instead of the result of a division by zero.
df[["total_day_charge_per_minute"]] <- ifelse(
  df[["total_day_minutes"]] == 0, 0,
  df[["total_day_charge"]] / df[["total_day_minutes"]]
)
df[["total_eve_charge_per_minute"]] <- ifelse(
  df[["total_eve_minutes"]] == 0, 0,
  df[["total_eve_charge"]] / df[["total_eve_minutes"]]
)
df[["total_night_charge_per_minute"]] <- ifelse(
  df[["total_night_minutes"]] == 0, 0,
  df[["total_night_charge"]] / df[["total_night_minutes"]]
)
df[["total_intl_charge_per_minute"]] <- ifelse(
  df[["total_intl_minutes"]] == 0, 0,
  df[["total_intl_charge"]] / df[["total_intl_minutes"]]
)

# Drop the raw charge and minutes columns now that the rates replace them.
df <- df[!(names(df) %in% c(
  "total_day_charge", "total_day_minutes",
  "total_eve_charge", "total_eve_minutes",
  "total_night_charge", "total_night_minutes",
  "total_intl_charge", "total_intl_minutes"
))]

CORRELATION PLOT AFTER DATA ENGINEERING

# Numeric columns of the engineered data set.
df_numeric <- df[vapply(df, is.numeric, logical(1))]

# Recompute the correlations and their p-values on the new feature set.
M <- cor(df_numeric)
p.mat <- cor_pmat(df_numeric)

# Same heat map settings as for the plot before data engineering.
ggcorrplot(
  M,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  p.mat = p.mat,
  sig.level = 0.05,
  lab_size = 2,
  tl.cex = 10,
  outline.col = "white",
  ggtheme = ggplot2::theme_minimal(),
  colors = c("#2F4F4E", "white", "#0D8387")
)

Now the features are no longer correlated.

HIGHER ORDER FEATURES

Only squaring as we have no negative data. Cubing would be needed with negative data.

# squared
# Square only the numeric columns. Selecting them by type, rather than
# squaring everything and dropping columns 2, 3 and 10 by position, keeps the
# factor columns out of the arithmetic and survives a change in column order.
df2 <- df[vapply(df, is.numeric, logical(1))]^2
colnames(df2) <- paste0(colnames(df2), "_sqd")

# Append the squared features to the right of the existing columns.
df <- cbind(df, df2)

Relationship between data in higher order

# theme_set(theme_minimal())
# 
# ggpairs(
#   data = df,
#   columns = c(1:9, 11:25),
#   mapping = aes(col = churn, alpha = .9)
# ) +
#   scale_fill_colorblind() +
#   scale_color_colorblind()

SAMPLING METHODS

As we have unbalanced data, we need to use a sampling method to balance the classes. There are four different methods: OVER / UNDER / BOTH / ROSE.

library(ROSE)
FALSE Loaded ROSE 0.0-4
# Four resampled copies of df are built here; each table() pair prints the
# churn class counts of df and of the resampled copy.
# NOTE(review): the three ovun.sample() calls pass no seed and no set.seed()
# appears earlier in the visible script (only the ROSE() call passes
# seed = 1), so these samples presumably change from run to run -- confirm.
# OVER
df_OVER <- ovun.sample(churn~., data = df, method = "over")$data

table(df$churn)
FALSE 
FALSE    0    1 
FALSE 4293  707
table(df_OVER$churn)
FALSE 
FALSE    0    1 
FALSE 4293 4267
# UNDER
df_UNDER <- ovun.sample(churn~., data = df, method = "under")$data

table(df$churn)
FALSE 
FALSE    0    1 
FALSE 4293  707
table(df_UNDER$churn)
FALSE 
FALSE   0   1 
FALSE 710 707
# BOTH
df_BOTH <- ovun.sample(churn~., data = df, method = "both")$data

table(df$churn)
FALSE 
FALSE    0    1 
FALSE 4293  707
table(df_BOTH$churn)
FALSE 
FALSE    0    1 
FALSE 2476 2524
# ROSE
# Synthetic sample with a target class-1 proportion of p = 0.5 and a fixed seed.
df_ROSE <- ROSE(churn ~ ., data = df, seed = 1, p = 0.5)$data

SAMPLING POST VISUALIZATION

# theme_set(theme_minimal())
# 
# ggpairs(
#   data = df_ROSE,
#   columns = c(1:9, 11:25),
#   mapping = aes(col = churn, alpha = .9)
# ) +
#   scale_fill_colorblind() +
#   scale_color_colorblind() +
#   labs(title = "Machine Learning Project")
# 
# ggpairs(
#   data = df_OVER,
#   columns = c(1:9, 11:25),
#   mapping = aes(col = churn, alpha = .9)
# ) +
#   scale_fill_colorblind() +
#   scale_color_colorblind() +
#   labs(title = "Machine Learning Project")

# ggpairs(
#   data = df_UNDER,
#   columns = c(1:9, 11:25),
#   mapping = aes(col = churn, alpha = .9)
# ) +
#   scale_fill_colorblind() +
#   scale_color_colorblind() +
#   labs(title = "Machine Learning Project")

# ggpairs(
#   data = df_BOTH,
#   columns = c(1:9, 11:25),
#   mapping = aes(col = churn, alpha = .9)
# ) +
#   scale_fill_colorblind() +
#   scale_color_colorblind() +
#   labs(title = "Machine Learning Project")

TRAIN AND TEST SPLIT

Because the models need to be tested, we split the sampled data into a training set and a test set.

# Fix the RNG seed before the partition call below.
set.seed(1)
data <- df_OVER # choose which data to use df_ROSE / df_BOTH / df_UNDER / df_OVER / df
# Partition on the churn labels into 70 % train / 30 % test index sets.
# NOTE(review): `data` was resampled BEFORE this split. With df_OVER the 707
# churn rows were oversampled to 4267, so copies of the same customer can land
# in both dftrain and dftest, which would inflate the test metrics. Consider
# splitting df first and resampling only the training part.
inds <- splitTools::partition(data$churn, p = c(train = 0.7, test = 0.3))
dftrain <- data[inds$train,]
dftest <- data[inds$test,]

SCALING

Because some methods need scaled data, we center and scale the data here.

# Estimate the centering and scaling parameters on the training set only,
# then apply the same transformation to the training and test sets.
norm.value <- preProcess(dftrain, method = c("center", "scale"))
dftrain <- predict(norm.value, dftrain)
dftest <- predict(norm.value, dftest)

# The full, unsampled data set transformed the same way; used by the
# "... ON ORIGINAL DATA" evaluations.
# NOTE(review): df holds every customer, including the rows the training set
# was drawn from, so those evaluations are not out-of-sample.
df_original_test <- predict(norm.value, df)

PREDICTIVE MODELS

BOOSTING

set.seed(123)

# train boosted model (boosting(), presumably from adabag, on all predictors)
mod.boost <- boosting(churn ~., data=dftrain)

# Predicted class labels for the test split, as a factor.
predicted.boost <- factor(predict(mod.boost, dftest, type="class")$class)

# Keep a copy under a model-specific name.
PRED_BOOSTING <- predicted.boost

# Confusion matrix with "1" (churn) as the positive class, then plot it.
confmat.boost <- confusionMatrix(data=predicted.boost, reference = dftest$churn, positive = '1')

CM_Function(confmat.boost)

# NOTE(review): the ROC is built from predicted class labels rather than
# class probabilities, so the curve has a single operating point.
roc_score.boost =roc(factor(dftest$churn, ordered=TRUE), factor(predicted.boost, ordered=TRUE))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.boost ,main ="ROC curve")

BOOSTING ON ORIGINAL DATA

# Score the already fitted boosting model on the full scaled data set.
# predicted.boost, confmat.boost and roc_score.boost are overwritten here.
predicted.boost <- factor(predict(mod.boost, df_original_test, type="class")$class)

PRED_BOOSTING_ORIGINAL <- predicted.boost

confmat.boost <- confusionMatrix(data=predicted.boost, reference = df_original_test$churn, positive = '1')

CM_Function(confmat.boost)

# ROC from the predicted class labels (not probabilities).
roc_score.boost =roc(factor(df_original_test$churn, ordered=TRUE), factor(predicted.boost, ordered=TRUE))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.boost ,main ="ROC curve")

CTREE

set.seed(123)

# Fit a classification tree with a very small complexity penalty and a
# minimum leaf size of 1 (presumably to grow a large tree before pruning).
tree_full <- rpart(churn ~ ., 
              data = dftrain, 
              method = "class",  # "class" because Y is a binary factor
              minbucket = 1,
              cp = 0.00001) 

# Plot tree
rpart.plot(tree_full, yesno = TRUE, digits =-6)

min_xerr<- which.min(tree_full$cptable[,"xerror"]) # select minimum cross-validation error
cp_bp <- tree_full$cptable[min_xerr,"CP"]  # find the corresponding CP value, to get the "best pruned " tree


mod.pruned_tree<- prune(tree_full, cp = cp_bp) # re-compute the tree with the selected Cp

rpart.plot(mod.pruned_tree, yesno = TRUE, digits =-3)

# Predict classes for the test split; column 10 is dropped first
# (presumably the churn column -- TODO confirm the column order).
predicted.pruned_tree <- predict(mod.pruned_tree, dftest[,-c(10)], type = "class")

# Keep a copy under a model-specific name.
PRED_CTREE <- predicted.pruned_tree

# Confusion matrix with "1" as the positive class, then plot it.
confmat.prunned_tree <- confusionMatrix(data=predicted.pruned_tree, reference = dftest$churn, positive = '1')

CM_Function(confmat.prunned_tree)

# ROC from the predicted class labels (not probabilities).
roc_score.prunned_tree =roc(factor(dftest$churn, ordered=TRUE), factor(predicted.pruned_tree, ordered=TRUE))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.prunned_tree ,main ="ROC curve")

library(rattle)
FALSE Loading required package: tibble
FALSE 
FALSE Attaching package: 'tibble'
FALSE The following object is masked from 'package:summarytools':
FALSE 
FALSE     view
FALSE Loading required package: bitops
FALSE Rattle: A free graphical interface for data science with R.
FALSE Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
FALSE Type 'rattle()' to shake, rattle, and roll your data.
FALSE 
FALSE Attaching package: 'rattle'
FALSE The following object is masked from 'package:xgboost':
FALSE 
FALSE     xgboost
# Draw the pruned tree on the current device first.
rpart.plot(mod.pruned_tree, yesno = TRUE, digits =-3)

# Customizing the output
# Opens a PDF device writing to CTREE.pdf in the working directory.
pdf("CTREE.pdf",        
    width = 30, height = 30, 
    bg = "white",          
    colormodel = "rgb")          

# Creating a plot
fancyRpartPlot(mod.pruned_tree,yesno=TRUE,main="Pruned Tree",tweak=3)

# Closing the graphical device
# NOTE(review): if the plot call above fails, this line is not reached and
# the PDF device stays open.
dev.off() 
FALSE quartz_off_screen 
FALSE                 2

CTREE ON ORIGINAL DATA

# Score the pruned tree on the full scaled data set (column 10 dropped first).
# predicted.pruned_tree, confmat.prunned_tree and roc_score.prunned_tree are
# overwritten here.
predicted.pruned_tree <- predict(mod.pruned_tree, df_original_test[,-c(10)], type = "class")

PRED_CTREE_ORIGINAL <- predicted.pruned_tree

confmat.prunned_tree <- confusionMatrix(data=predicted.pruned_tree, reference = df_original_test$churn, positive = '1')

CM_Function(confmat.prunned_tree)

# ROC from the predicted class labels (not probabilities).
roc_score.prunned_tree =roc(factor(df_original_test$churn, ordered=TRUE), factor(predicted.pruned_tree, ordered=TRUE))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.prunned_tree ,main ="ROC curve")

VARIABLES IMPORTANCES

# Variable importance stored in the pruned tree, as a one-column data frame
# whose row names are the variables.
relevance <- as.data.frame(mod.pruned_tree$variable.importance)
# Build the table. TRUE is spelled out because `T` is an ordinary variable
# that can be reassigned.
kable(relevance, row.names = TRUE, col.names = "Variable Importance") %>%
  kable_paper("hover", full_width = TRUE)
Variable Importance
total_day_charge_per_minute 629.2474
total_day_charge_per_minute_sqd 564.1473
account_length 430.9677
number_customer_service_calls 422.6086
number_customer_service_calls_sqd 375.9488
total_eve_calls 375.2453
total_night_calls 360.8875
total_eve_charge_per_minute 354.0565
total_night_charge_per_minute 350.1067
total_day_calls 334.7627
total_intl_charge_per_minute 317.3755
total_eve_calls_sqd 315.7726
total_eve_charge_per_minute_sqd 295.9379
total_night_calls_sqd 294.5691
account_length_sqd 293.5768
international_plan 290.6985
total_night_charge_per_minute_sqd 279.7621
total_intl_charge_per_minute_sqd 261.1063
total_day_calls_sqd 234.7268
total_intl_calls 217.1833
number_vmail_messages 181.5768
total_intl_calls_sqd 170.5660
number_vmail_messages_sqd 166.1282
voice_mail_plan 133.3825
# Print the same importance values as a plain data frame.
relevance
FALSE                                   mod.pruned_tree$variable.importance
FALSE total_day_charge_per_minute                                  629.2474
FALSE total_day_charge_per_minute_sqd                              564.1473
FALSE account_length                                               430.9677
FALSE number_customer_service_calls                                422.6086
FALSE number_customer_service_calls_sqd                            375.9488
FALSE total_eve_calls                                              375.2453
FALSE total_night_calls                                            360.8875
FALSE total_eve_charge_per_minute                                  354.0565
FALSE total_night_charge_per_minute                                350.1067
FALSE total_day_calls                                              334.7627
FALSE total_intl_charge_per_minute                                 317.3755
FALSE total_eve_calls_sqd                                          315.7726
FALSE total_eve_charge_per_minute_sqd                              295.9379
FALSE total_night_calls_sqd                                        294.5691
FALSE account_length_sqd                                           293.5768
FALSE international_plan                                           290.6985
FALSE total_night_charge_per_minute_sqd                            279.7621
FALSE total_intl_charge_per_minute_sqd                             261.1063
FALSE total_day_calls_sqd                                          234.7268
FALSE total_intl_calls                                             217.1833
FALSE number_vmail_messages                                        181.5768
FALSE total_intl_calls_sqd                                         170.5660
FALSE number_vmail_messages_sqd                                    166.1282
FALSE voice_mail_plan                                              133.3825

BAGGING

set.seed(123)
library(ipred)
FALSE 
FALSE Attaching package: 'ipred'
FALSE The following object is masked from 'package:adabag':
FALSE 
FALSE     bagging
library(pROC)

# train bagged model
# bagging() resolves to the ipred version here: ipred was attached after
# adabag and masks its bagging (see the message above).
# Settings: 100 bootstrap replications, out-of-bag error estimate requested
# (coob = TRUE), trees grown with minsplit = 2 and cp = 0.
ames_bag1 <- bagging(
  formula = churn ~ .,
  data = dftrain, 
  nbagg = 100,  
  coob = TRUE,
  control = rpart.control(minsplit = 2, cp = 0)
  )

# Print the fitted ensemble, including the out-of-bag error estimate.
ames_bag1
FALSE 
FALSE Bagging classification trees with 100 bootstrap replications 
FALSE 
FALSE Call: bagging.data.frame(formula = churn ~ ., data = dftrain, nbagg = 100, 
FALSE     coob = TRUE, control = rpart.control(minsplit = 2, cp = 0))
FALSE 
FALSE Out-of-bag estimate of misclassification error:  0.0447
# Threshold the second probability column (presumably class "1") at 0.5.
# Both levels are declared explicitly so the factor always has levels "0" and
# "1"; a run in which every prediction falls in one class then still yields a
# 2x2 confusion table.
predicted <- factor(
  ifelse(predict(ames_bag1, dftest[, -c(10)], type = "prob")[, 2] >= 0.5, 1, 0),
  levels = c(0, 1)
)

# Keep a copy under a model-specific name.
PRED_BAGGING <- predicted

# Confusion matrix with "1" as the positive class, plotted directly.
CM_Function(confusionMatrix(data = predicted, reference = dftest$churn, positive = "1"))

# ROC object built from the thresholded class labels (not probabilities).
roc_score=roc(factor(dftest$churn, ordered=TRUE), factor(predicted, ordered=TRUE)) #AUC score
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# AUC rounded to 4 decimals. The numeric result is stored under the name
# `auc`, the same name as the auc() function; later auc(...) calls still work
# because R skips non-function objects when looking up a function.
auc <- round(auc(factor(dftest$churn, ordered=TRUE), factor(predicted, ordered=TRUE)),4)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# ROC curve with the AUC value in the title.
ggroc(roc_score, colour = '#0D8387', size = 1) +
 ggtitle(paste0('ROC Curve ', '(AUC = ', auc, ')')) + theme_minimal() + theme(plot.title = element_text(face = "bold")) + labs(x="Specificity", y="Sensitivity")

BAGGING ON ORIGINAL DATA

# Score the bagged ensemble on the full scaled data set. The second
# probability column (presumably class "1") is thresholded at 0.5; both levels
# are declared explicitly so the factor always has levels "0" and "1", even if
# every prediction falls in one class.
predicted <- factor(
  ifelse(predict(ames_bag1, df_original_test[, -c(10)], type = "prob")[, 2] >= 0.5, 1, 0),
  levels = c(0, 1)
)

# Keep a copy under a model-specific name.
PRED_BAGGING_ORIGINAL <- predicted

# Confusion matrix with "1" as the positive class, plotted directly.
CM_Function(confusionMatrix(data = predicted, reference = df_original_test$churn, positive = "1"))

# ROC object for the full data set, built from the thresholded class labels.
roc_score=roc(factor(df_original_test$churn, ordered=TRUE), factor(predicted, ordered=TRUE)) #AUC score
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# AUC rounded to 4 decimals; overwrites the numeric `auc` from the test split.
auc <- round(auc(factor(df_original_test$churn, ordered=TRUE), factor(predicted, ordered=TRUE)),4)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# ROC curve with the AUC value in the title.
ggroc(roc_score, colour = '#0D8387', size = 1) +
 ggtitle(paste0('ROC Curve ', '(AUC = ', auc, ')')) + theme_minimal() + theme(plot.title = element_text(face = "bold")) + labs(x="Specificity", y="Sensitivity")

KNN

set.seed(1)

# Test-split accuracy and sensitivity for k = 1, ..., 30. The grid is kept in
# its own object so the engineered data set `df` is not overwritten.
knn_grid <- data.frame(k = seq(1, 30, 1), accuracy = rep(0, 30), sensitivity = rep(0, 30))

# iterating over different ks
for (i in seq_len(nrow(knn_grid))) {
  # nearest neighbor
  KNN1 <- knn3(y = dftrain$churn, x = dftrain[, -c(10)], k = i)

  # predictions response
  KNN1.pred.valid.resp <- predict(KNN1, dftest[, -c(10)], type = "class")

  # predictions prob
  # NOTE(review): this value is never read. The call is kept only because
  # removing it could shift any random tie-breaking in later predictions --
  # confirm, then drop it.
  KNN1.pred.valid.prob <- predict(KNN1, dftest[, -c(10)], type = "prob")[, 2]

  # Confusion matrix, computed once and used for both metrics
  KNN1.cm <- confusionMatrix(KNN1.pred.valid.resp, dftest$churn, positive = "1")
  knn_grid$sensitivity[i] <- KNN1.cm$byClass[1]
  knn_grid$accuracy[i] <- KNN1.cm$overall[1]
}

# plot the k's
ggplot(knn_grid, aes(x = k)) +
  geom_line(aes(y = sensitivity, colour = "Sensitivity")) +
  geom_line(aes(y = accuracy, colour = "Accuracy")) +
  labs(x = "Number of k nearest neighbours",
       y = "Accuracy / Sensitivity", title = "Accuracy / Sensitivity regarding k") +
  theme_minimal() +
  scale_y_continuous(name = "Sensitivity / Accuracy", limits = c(0.7, 1)) +
  scale_color_manual(name = "Values", values = c("Sensitivity" = "darkblue", "Accuracy" = "red")) +
  xlim(1, 30)

# Final KNN model with k = 2, fitted on the training split (column 10 dropped
# from the predictors).
mod.knn <- knn3(y = dftrain$churn, x = dftrain[, -c(10)], k = 2)

# Predicted classes for the test split.
predicted.knn <- predict(mod.knn, dftest[, -c(10)], type = "class")

# Keep a copy under a model-specific name.
PRED_KNN <- predicted.knn

# Confusion matrix with "1" as the positive class, then plot it.
confmat.knn <- confusionMatrix(data = predicted.knn, reference = dftest$churn, positive = "1")

CM_Function(confmat.knn)

# ROC from the predicted class labels. Stored as roc_score.knn; the previous
# name roc_score.qda was a leftover from the QDA section.
roc_score.knn <- roc(factor(dftest$churn, ordered = TRUE), factor(predicted.knn, ordered = TRUE))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.knn, main = "ROC curve")

KNN ON ORIGINAL DATA

# Score the fitted KNN model on the full scaled data set (column 10 dropped).
# predicted.knn and confmat.knn are overwritten here.
predicted.knn <- predict(mod.knn, df_original_test[, -c(10)], type = "class")

PRED_KNN_ORIGINAL <- predicted.knn

confmat.knn <- confusionMatrix(data = predicted.knn, reference = df_original_test$churn, positive = "1")

CM_Function(confmat.knn)

# ROC from the predicted class labels. Stored as roc_score.knn; the previous
# name roc_score.qda was a leftover from the QDA section.
roc_score.knn <- roc(factor(df_original_test$churn, ordered = TRUE), factor(predicted.knn, ordered = TRUE))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.knn, main = "ROC curve")

QDA

# Quadratic discriminant analysis on all predictors of the training split.
mod.qda <- qda(churn ~., data = dftrain)

# Predicted classes for the test split (column 10 dropped first).
# NOTE(review): unlike the other models, no PRED_QDA copy is stored here --
# confirm whether later code expects one.
predicted.qda <- predict(mod.qda, dftest[,-c(10)])$class

# Confusion matrix with "1" as the positive class, then plot it.
confmat.qda <- confusionMatrix(data=predicted.qda, reference = dftest$churn, positive = '1')

CM_Function(confmat.qda)

# ROC from the predicted class labels (not probabilities).
roc_score.qda =roc(factor(dftest$churn, ordered=TRUE), factor(predicted.qda, ordered=TRUE))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.qda ,main ="ROC curve")

QDA ON ORIGINAL DATA

# Score the fitted QDA model on the full scaled data set (column 10 dropped).
# predicted.qda, confmat.qda and roc_score.qda are overwritten here.
predicted.qda <- predict(mod.qda, df_original_test[,-c(10)])$class

confmat.qda <- confusionMatrix(data=predicted.qda, reference = df_original_test$churn, positive = '1')

CM_Function(confmat.qda)

# ROC from the predicted class labels (not probabilities).
roc_score.qda =roc(factor(df_original_test$churn, ordered=TRUE), factor(predicted.qda, ordered=TRUE))
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
plot(roc_score.qda ,main ="ROC curve")

QLOG

# Binomial GLM on all predictors of the training split.
# NOTE(review): despite the `mod.log` name, the link is probit, not the logit
# link of a logistic regression -- confirm which one is intended.
mod.log <- glm(churn ~., data = dftrain, family = binomial(link = "probit"))

# Stepwise model selection with step(); its AIC trace is printed below.
s <- step(mod.log)
FALSE Start:  AIC=6868.42
FALSE churn ~ account_length + international_plan + voice_mail_plan + 
FALSE     number_vmail_messages + total_day_calls + total_eve_calls + 
FALSE     total_night_calls + total_intl_calls + number_customer_service_calls + 
FALSE     total_day_charge_per_minute + total_eve_charge_per_minute + 
FALSE     total_night_charge_per_minute + total_intl_charge_per_minute + 
FALSE     account_length_sqd + number_vmail_messages_sqd + total_day_calls_sqd + 
FALSE     total_eve_calls_sqd + total_night_calls_sqd + total_intl_calls_sqd + 
FALSE     number_customer_service_calls_sqd + total_day_charge_per_minute_sqd + 
FALSE     total_eve_charge_per_minute_sqd + total_night_charge_per_minute_sqd + 
FALSE     total_intl_charge_per_minute_sqd
FALSE 
FALSE                                     Df Deviance    AIC
FALSE - total_day_charge_per_minute        1   6818.5 6866.5
FALSE - total_day_charge_per_minute_sqd    1   6818.5 6866.5
FALSE - total_night_calls_sqd              1   6818.9 6866.9
FALSE - total_night_calls                  1   6818.9 6866.9
FALSE - account_length_sqd                 1   6819.3 6867.3
FALSE - total_eve_calls                    1   6819.4 6867.4
FALSE - total_day_calls                    1   6819.6 6867.6
FALSE - total_eve_calls_sqd                1   6819.7 6867.7
FALSE - total_day_calls_sqd                1   6820.2 6868.2
FALSE <none>                                   6818.4 6868.4
FALSE - total_intl_charge_per_minute       1   6820.8 6868.8
FALSE - total_intl_charge_per_minute_sqd   1   6821.0 6869.0
FALSE - account_length                     1   6821.2 6869.2
FALSE - total_night_charge_per_minute_sqd  1   6821.9 6869.9
FALSE - total_night_charge_per_minute      1   6822.0 6870.0
FALSE - number_vmail_messages_sqd          1   6823.5 6871.5
FALSE - number_vmail_messages              1   6826.0 6874.0
FALSE - total_intl_calls_sqd               1   6833.2 6881.2
FALSE - total_eve_charge_per_minute        1   6839.1 6887.1
FALSE - total_eve_charge_per_minute_sqd    1   6839.1 6887.1
FALSE - voice_mail_plan                    1   6841.0 6889.0
FALSE - total_intl_calls                   1   6841.4 6889.4
FALSE - number_customer_service_calls      1   6850.3 6898.3
FALSE - number_customer_service_calls_sqd  1   7007.0 7055.0
FALSE - international_plan                 1   7426.7 7474.7
FALSE 
FALSE Step:  AIC=6866.46
FALSE churn ~ account_length + international_plan + voice_mail_plan + 
FALSE     number_vmail_messages + total_day_calls + total_eve_calls + 
FALSE     total_night_calls + total_intl_calls + number_customer_service_calls + 
FALSE     total_eve_charge_per_minute + total_night_charge_per_minute + 
FALSE     total_intl_charge_per_minute + account_length_sqd + number_vmail_messages_sqd + 
FALSE     total_day_calls_sqd + total_eve_calls_sqd + total_night_calls_sqd + 
FALSE     total_intl_calls_sqd + number_customer_service_calls_sqd + 
FALSE     total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd + 
FALSE     total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE 
FALSE                                     Df Deviance    AIC
FALSE - total_night_calls_sqd              1   6818.9 6864.9
FALSE - total_night_calls                  1   6819.0 6865.0
FALSE - account_length_sqd                 1   6819.4 6865.4
FALSE - total_eve_calls                    1   6819.4 6865.4
FALSE - total_day_calls                    1   6819.6 6865.6
FALSE - total_eve_calls_sqd                1   6819.7 6865.7
FALSE - total_day_calls_sqd                1   6820.2 6866.2
FALSE <none>                                   6818.5 6866.5
FALSE - total_intl_charge_per_minute       1   6820.8 6866.8
FALSE - total_intl_charge_per_minute_sqd   1   6821.1 6867.1
FALSE - account_length                     1   6821.3 6867.3
FALSE - total_night_charge_per_minute_sqd  1   6821.9 6867.9
FALSE - total_night_charge_per_minute      1   6822.0 6868.0
FALSE - number_vmail_messages_sqd          1   6823.5 6869.5
FALSE - number_vmail_messages              1   6826.1 6872.1
FALSE - total_intl_calls_sqd               1   6833.2 6879.2
FALSE - total_day_charge_per_minute_sqd    1   6838.3 6884.3
FALSE - total_eve_charge_per_minute        1   6839.1 6885.1
FALSE - total_eve_charge_per_minute_sqd    1   6839.1 6885.1
FALSE - voice_mail_plan                    1   6841.1 6887.1
FALSE - total_intl_calls                   1   6841.4 6887.4
FALSE - number_customer_service_calls      1   6850.4 6896.4
FALSE - number_customer_service_calls_sqd  1   7007.1 7053.1
FALSE - international_plan                 1   7426.7 7472.7
FALSE 
FALSE Step:  AIC=6864.89
FALSE churn ~ account_length + international_plan + voice_mail_plan + 
FALSE     number_vmail_messages + total_day_calls + total_eve_calls + 
FALSE     total_night_calls + total_intl_calls + number_customer_service_calls + 
FALSE     total_eve_charge_per_minute + total_night_charge_per_minute + 
FALSE     total_intl_charge_per_minute + account_length_sqd + number_vmail_messages_sqd + 
FALSE     total_day_calls_sqd + total_eve_calls_sqd + total_intl_calls_sqd + 
FALSE     number_customer_service_calls_sqd + total_day_charge_per_minute_sqd + 
FALSE     total_eve_charge_per_minute_sqd + total_night_charge_per_minute_sqd + 
FALSE     total_intl_charge_per_minute_sqd
FALSE 
FALSE                                     Df Deviance    AIC
FALSE - total_night_calls                  1   6819.1 6863.1
FALSE - account_length_sqd                 1   6819.8 6863.8
FALSE - total_eve_calls                    1   6819.8 6863.8
FALSE - total_eve_calls_sqd                1   6820.1 6864.1
FALSE - total_day_calls                    1   6820.1 6864.1
FALSE - total_day_calls_sqd                1   6820.7 6864.7
FALSE <none>                                   6818.9 6864.9
FALSE - total_intl_charge_per_minute       1   6821.3 6865.3
FALSE - total_intl_charge_per_minute_sqd   1   6821.5 6865.5
FALSE - account_length                     1   6821.7 6865.7
FALSE - total_night_charge_per_minute_sqd  1   6822.3 6866.3
FALSE - total_night_charge_per_minute      1   6822.4 6866.4
FALSE - number_vmail_messages_sqd          1   6823.9 6867.9
FALSE - number_vmail_messages              1   6826.4 6870.4
FALSE - total_intl_calls_sqd               1   6833.6 6877.6
FALSE - total_day_charge_per_minute_sqd    1   6838.5 6882.5
FALSE - total_eve_charge_per_minute        1   6839.8 6883.8
FALSE - total_eve_charge_per_minute_sqd    1   6839.8 6883.8
FALSE - voice_mail_plan                    1   6841.4 6885.4
FALSE - total_intl_calls                   1   6841.9 6885.9
FALSE - number_customer_service_calls      1   6850.7 6894.7
FALSE - number_customer_service_calls_sqd  1   7007.2 7051.2
FALSE - international_plan                 1   7427.9 7471.9
FALSE 
FALSE Step:  AIC=6863.08
FALSE churn ~ account_length + international_plan + voice_mail_plan + 
FALSE     number_vmail_messages + total_day_calls + total_eve_calls + 
FALSE     total_intl_calls + number_customer_service_calls + total_eve_charge_per_minute + 
FALSE     total_night_charge_per_minute + total_intl_charge_per_minute + 
FALSE     account_length_sqd + number_vmail_messages_sqd + total_day_calls_sqd + 
FALSE     total_eve_calls_sqd + total_intl_calls_sqd + number_customer_service_calls_sqd + 
FALSE     total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd + 
FALSE     total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE 
FALSE                                     Df Deviance    AIC
FALSE - account_length_sqd                 1   6820.0 6862.0
FALSE - total_eve_calls                    1   6820.0 6862.0
FALSE - total_day_calls                    1   6820.3 6862.3
FALSE - total_eve_calls_sqd                1   6820.3 6862.3
FALSE - total_day_calls_sqd                1   6820.8 6862.8
FALSE <none>                                   6819.1 6863.1
FALSE - total_intl_charge_per_minute       1   6821.4 6863.4
FALSE - total_intl_charge_per_minute_sqd   1   6821.7 6863.7
FALSE - account_length                     1   6821.9 6863.9
FALSE - total_night_charge_per_minute_sqd  1   6822.5 6864.5
FALSE - total_night_charge_per_minute      1   6822.6 6864.6
FALSE - number_vmail_messages_sqd          1   6824.1 6866.1
FALSE - number_vmail_messages              1   6826.6 6868.6
FALSE - total_intl_calls_sqd               1   6833.7 6875.7
FALSE - total_day_charge_per_minute_sqd    1   6838.7 6880.7
FALSE - total_eve_charge_per_minute        1   6840.0 6882.0
FALSE - total_eve_charge_per_minute_sqd    1   6840.0 6882.0
FALSE - voice_mail_plan                    1   6841.6 6883.6
FALSE - total_intl_calls                   1   6842.0 6884.0
FALSE - number_customer_service_calls      1   6850.8 6892.8
FALSE - number_customer_service_calls_sqd  1   7007.2 7049.2
FALSE - international_plan                 1   7428.6 7470.6
FALSE 
FALSE Step:  AIC=6861.97
FALSE churn ~ account_length + international_plan + voice_mail_plan + 
FALSE     number_vmail_messages + total_day_calls + total_eve_calls + 
FALSE     total_intl_calls + number_customer_service_calls + total_eve_charge_per_minute + 
FALSE     total_night_charge_per_minute + total_intl_charge_per_minute + 
FALSE     number_vmail_messages_sqd + total_day_calls_sqd + total_eve_calls_sqd + 
FALSE     total_intl_calls_sqd + number_customer_service_calls_sqd + 
FALSE     total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd + 
FALSE     total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE 
FALSE                                     Df Deviance    AIC
FALSE - total_eve_calls                    1   6820.9 6860.9
FALSE - total_day_calls                    1   6821.1 6861.1
FALSE - total_eve_calls_sqd                1   6821.1 6861.1
FALSE - total_day_calls_sqd                1   6821.7 6861.7
FALSE <none>                                   6820.0 6862.0
FALSE - total_intl_charge_per_minute       1   6822.3 6862.3
FALSE - total_intl_charge_per_minute_sqd   1   6822.6 6862.6
FALSE - total_night_charge_per_minute_sqd  1   6823.3 6863.3
FALSE - total_night_charge_per_minute      1   6823.4 6863.4
FALSE - number_vmail_messages_sqd          1   6824.9 6864.9
FALSE - number_vmail_messages              1   6827.4 6867.4
FALSE - account_length                     1   6828.9 6868.9
FALSE - total_intl_calls_sqd               1   6834.8 6874.8
FALSE - total_day_charge_per_minute_sqd    1   6839.7 6879.7
FALSE - total_eve_charge_per_minute        1   6840.8 6880.8
FALSE - total_eve_charge_per_minute_sqd    1   6840.8 6880.8
FALSE - voice_mail_plan                    1   6842.2 6882.2
FALSE - total_intl_calls                   1   6843.0 6883.0
FALSE - number_customer_service_calls      1   6852.0 6892.0
FALSE - number_customer_service_calls_sqd  1   7009.0 7049.0
FALSE - international_plan                 1   7430.3 7470.3
FALSE 
FALSE Step:  AIC=6860.89
FALSE churn ~ account_length + international_plan + voice_mail_plan + 
FALSE     number_vmail_messages + total_day_calls + total_intl_calls + 
FALSE     number_customer_service_calls + total_eve_charge_per_minute + 
FALSE     total_night_charge_per_minute + total_intl_charge_per_minute + 
FALSE     number_vmail_messages_sqd + total_day_calls_sqd + total_eve_calls_sqd + 
FALSE     total_intl_calls_sqd + number_customer_service_calls_sqd + 
FALSE     total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd + 
FALSE     total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE 
FALSE                                     Df Deviance    AIC
FALSE - total_eve_calls_sqd                1   6821.9 6859.9
FALSE - total_day_calls                    1   6822.0 6860.0
FALSE - total_day_calls_sqd                1   6822.5 6860.5
FALSE <none>                                   6820.9 6860.9
FALSE - total_intl_charge_per_minute       1   6823.2 6861.2
FALSE - total_intl_charge_per_minute_sqd   1   6823.5 6861.5
FALSE - total_night_charge_per_minute_sqd  1   6824.1 6862.1
FALSE - total_night_charge_per_minute      1   6824.2 6862.2
FALSE - number_vmail_messages_sqd          1   6825.8 6863.8
FALSE - number_vmail_messages              1   6828.3 6866.3
FALSE - account_length                     1   6829.9 6867.9
FALSE - total_intl_calls_sqd               1   6835.5 6873.5
FALSE - total_day_charge_per_minute_sqd    1   6840.8 6878.8
FALSE - total_eve_charge_per_minute        1   6841.7 6879.7
FALSE - total_eve_charge_per_minute_sqd    1   6841.7 6879.7
FALSE - voice_mail_plan                    1   6843.2 6881.2
FALSE - total_intl_calls                   1   6843.7 6881.7
FALSE - number_customer_service_calls      1   6852.7 6890.7
FALSE - number_customer_service_calls_sqd  1   7009.3 7047.3
FALSE - international_plan                 1   7433.2 7471.2
FALSE 
FALSE Step:  AIC=6859.87
FALSE churn ~ account_length + international_plan + voice_mail_plan + 
FALSE     number_vmail_messages + total_day_calls + total_intl_calls + 
FALSE     number_customer_service_calls + total_eve_charge_per_minute + 
FALSE     total_night_charge_per_minute + total_intl_charge_per_minute + 
FALSE     number_vmail_messages_sqd + total_day_calls_sqd + total_intl_calls_sqd + 
FALSE     number_customer_service_calls_sqd + total_day_charge_per_minute_sqd + 
FALSE     total_eve_charge_per_minute_sqd + total_night_charge_per_minute_sqd + 
FALSE     total_intl_charge_per_minute_sqd
FALSE 
FALSE                                     Df Deviance    AIC
FALSE - total_day_calls                    1   6822.9 6858.9
FALSE - total_day_calls_sqd                1   6823.5 6859.5
FALSE <none>                                   6821.9 6859.9
FALSE - total_intl_charge_per_minute       1   6824.2 6860.2
FALSE - total_intl_charge_per_minute_sqd   1   6824.5 6860.5
FALSE - total_night_charge_per_minute_sqd  1   6825.2 6861.2
FALSE - total_night_charge_per_minute      1   6825.3 6861.3
FALSE - number_vmail_messages_sqd          1   6826.7 6862.7
FALSE - number_vmail_messages              1   6829.3 6865.3
FALSE - account_length                     1   6830.7 6866.7
FALSE - total_intl_calls_sqd               1   6836.8 6872.8
FALSE - total_day_charge_per_minute_sqd    1   6841.7 6877.7
FALSE - total_eve_charge_per_minute        1   6842.4 6878.4
FALSE - total_eve_charge_per_minute_sqd    1   6842.4 6878.4
FALSE - voice_mail_plan                    1   6844.1 6880.1
FALSE - total_intl_calls                   1   6844.9 6880.9
FALSE - number_customer_service_calls      1   6853.8 6889.8
FALSE - number_customer_service_calls_sqd  1   7010.9 7046.9
FALSE - international_plan                 1   7434.3 7470.3
FALSE 
FALSE Step:  AIC=6858.94
FALSE churn ~ account_length + international_plan + voice_mail_plan + 
FALSE     number_vmail_messages + total_intl_calls + number_customer_service_calls + 
FALSE     total_eve_charge_per_minute + total_night_charge_per_minute + 
FALSE     total_intl_charge_per_minute + number_vmail_messages_sqd + 
FALSE     total_day_calls_sqd + total_intl_calls_sqd + number_customer_service_calls_sqd + 
FALSE     total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd + 
FALSE     total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE 
FALSE                                     Df Deviance    AIC
FALSE <none>                                   6822.9 6858.9
FALSE - total_intl_charge_per_minute       1   6825.3 6859.3
FALSE - total_intl_charge_per_minute_sqd   1   6825.5 6859.5
FALSE - total_night_charge_per_minute_sqd  1   6826.2 6860.2
FALSE - total_day_calls_sqd                1   6826.3 6860.3
FALSE - total_night_charge_per_minute      1   6826.3 6860.3
FALSE - number_vmail_messages_sqd          1   6827.8 6861.8
FALSE - number_vmail_messages              1   6830.4 6864.4
FALSE - account_length                     1   6831.9 6865.9
FALSE - total_intl_calls_sqd               1   6837.7 6871.7
FALSE - total_day_charge_per_minute_sqd    1   6843.0 6877.0
FALSE - total_eve_charge_per_minute        1   6843.3 6877.3
FALSE - total_eve_charge_per_minute_sqd    1   6843.3 6877.3
FALSE - voice_mail_plan                    1   6845.3 6879.3
FALSE - total_intl_calls                   1   6845.8 6879.8
FALSE - number_customer_service_calls      1   6854.4 6888.4
FALSE - number_customer_service_calls_sqd  1   7011.2 7045.2
FALSE - international_plan                 1   7438.5 7472.5
# Refit a binomial GLM on the training set with the formula stored in
# `s$formula` (presumably the result of the stepwise selection printed above).
# NOTE(review): the object is called `mod.log`, but the link is probit, not
# logit -- confirm this is intended.
mod.log <- glm(s$formula, data = dftrain, family = binomial(link = "probit"))

# Predicted churn probabilities for the test set. Column 10 is dropped before
# predicting (presumably the response `churn` -- confirm).
prob.log <- predict(mod.log, dftest[, -c(10)], type = "response")

# Classify at the 0.5 cut-off. The levels are fixed so that the factor always
# carries both classes, even if every prediction falls on one side of the
# cut-off.
predicted.log <- factor(ifelse(prob.log > 0.5, 1, 0), levels = c(0, 1))

confmat.log <- confusionMatrix(
  data = predicted.log,
  reference = dftest$churn,
  positive = "1"
)

CM_Function(confmat.log)

# Build the ROC curve from the predicted probabilities rather than from the
# thresholded labels: a 0/1 predictor yields a single operating point, whereas
# the probabilities trace the curve over every threshold.
roc_score.log <- roc(factor(dftest$churn, ordered = TRUE), prob.log)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# Draw the ROC curve of the GLM classifier.
plot(
  roc_score.log,
  main = "ROC curve"
)

GAUSSIAN SVM

library(e1071)

# Support vector machine with a radial (Gaussian) kernel, fitted as a
# classifier (`C-classification`) of `churn` on all other training columns.
mod.svm <- svm(
  formula = churn ~ .,
  data = dftrain,
  type = "C-classification",
  kernel = "radial"
)

# Class predictions for the test set; column 10 is dropped before predicting
# (presumably the response `churn` -- confirm).
predicted.svm <- predict(mod.svm, dftest[, -10])

confmat.svm <- confusionMatrix(
  data = predicted.svm,
  reference = dftest$churn,
  positive = "1"
)

CM_Function(confmat.svm)

# ROC object computed from the predicted class labels.
# NOTE(review): because the predictor is a two-level label, this curve has a
# single operating point -- consider scoring with decision values instead.
roc_score.svm <- roc(
  factor(dftest$churn, ordered = TRUE),
  factor(predicted.svm, ordered = TRUE)
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# Draw the ROC curve of the SVM classifier.
plot(
  roc_score.svm,
  main = "ROC curve"
)

ENSEMBLES - MAJORITY VOTING

BOOSTING CTREE BAGGING

# Majority vote over the class predictions of the boosting, CTREE and bagging
# models on the test set.
# cbind() replaces factors by their internal integer codes, so code 2 is
# recoded as a vote of 1 and everything else as 0.
# NOTE(review): assumes every PRED_* is a factor whose second level is the
# churn class "1" -- confirm where the predictions are created.
ENSEMBLES <- cbind(PRED_BOOSTING, PRED_CTREE, PRED_BAGGING)
ENSEMBLES <- as.data.frame(ifelse(ENSEMBLES == 2, 1, 0))

# A row is voted 1 when more than (k - 1) / 2 of the k models vote 1, i.e. at
# least two of the three models here.
MAJORITY_VOTE <- ifelse(rowSums(ENSEMBLES) > (ncol(ENSEMBLES) - 1) / 2, 1, 0)

# Fix the levels so the vote is always a two-level factor, even when every
# row receives the same vote.
ENSEMBLES$MAJORITY_VOTE <- factor(MAJORITY_VOTE, levels = c(0, 1))

CM_ENSEMBLES <- confusionMatrix(
  data = ENSEMBLES$MAJORITY_VOTE,
  reference = dftest$churn,
  positive = "1"
)

CM_Function(CM_ENSEMBLES)

# ROC object of the majority vote against the observed churn.
roc_score <- roc(
  factor(dftest$churn, ordered = TRUE),
  factor(ENSEMBLES$MAJORITY_VOTE, ordered = TRUE)
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# Area under the ROC curve of the majority vote, rounded to four decimals for
# the plot title. The result is stored under the name `auc`; R still resolves
# `auc(...)` to the function because non-function objects are skipped when a
# name is called.
auc <- round(
  auc(
    factor(dftest$churn, ordered = TRUE),
    factor(ENSEMBLES$MAJORITY_VOTE, ordered = TRUE)
  ),
  4
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# ROC curve of the majority vote, with the rounded AUC shown in the title.
ggroc(roc_score, colour = "#0D8387", size = 1) +
  ggtitle(paste0("ROC Curve (AUC = ", auc, ")")) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(x = "Specificity", y = "Sensitivity")

ENSEMBLES - MAJORITY VOTING ON ORIGINAL

BOOSTING CTREE BAGGING

# Majority vote over the class predictions of the boosting, CTREE and bagging
# models on the original test set.
# cbind() replaces factors by their internal integer codes, so code 2 is
# recoded as a vote of 1 and everything else as 0.
# NOTE(review): assumes every PRED_*_ORIGINAL is a factor whose second level is
# the churn class "1" -- confirm where the predictions are created.
ENSEMBLES <- cbind(
  PRED_BOOSTING_ORIGINAL,
  PRED_CTREE_ORIGINAL,
  PRED_BAGGING_ORIGINAL
)
ENSEMBLES <- as.data.frame(ifelse(ENSEMBLES == 2, 1, 0))

# A row is voted 1 when more than (k - 1) / 2 of the k models vote 1, i.e. at
# least two of the three models here.
MAJORITY_VOTE <- ifelse(rowSums(ENSEMBLES) > (ncol(ENSEMBLES) - 1) / 2, 1, 0)

# Fix the levels so the vote is always a two-level factor, even when every
# row receives the same vote.
ENSEMBLES$MAJORITY_VOTE <- factor(MAJORITY_VOTE, levels = c(0, 1))

CM_ENSEMBLES <- confusionMatrix(
  data = ENSEMBLES$MAJORITY_VOTE,
  reference = df_original_test$churn,
  positive = "1"
)

CM_Function(CM_ENSEMBLES)

# ROC object of the majority vote against the observed churn.
roc_score <- roc(
  factor(df_original_test$churn, ordered = TRUE),
  factor(ENSEMBLES$MAJORITY_VOTE, ordered = TRUE)
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# Area under the ROC curve of the majority vote on the original test set,
# rounded to four decimals for the plot title. The result is stored under the
# name `auc`; R still resolves `auc(...)` to the function because non-function
# objects are skipped when a name is called.
auc <- round(
  auc(
    factor(df_original_test$churn, ordered = TRUE),
    factor(ENSEMBLES$MAJORITY_VOTE, ordered = TRUE)
  ),
  4
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# ROC curve of the majority vote, with the rounded AUC shown in the title.
ggroc(roc_score, colour = "#0D8387", size = 1) +
  ggtitle(paste0("ROC Curve (AUC = ", auc, ")")) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(x = "Specificity", y = "Sensitivity")

ENSEMBLES - MAJORITY VOTING

KNN CTREE BAGGING

# Majority vote over the class predictions of the KNN, CTREE and bagging
# models on the test set.
# cbind() replaces factors by their internal integer codes, so code 2 is
# recoded as a vote of 1 and everything else as 0.
# NOTE(review): assumes every PRED_* is a factor whose second level is the
# churn class "1" -- confirm where the predictions are created.
ENSEMBLES <- cbind(PRED_KNN, PRED_CTREE, PRED_BAGGING)
ENSEMBLES <- as.data.frame(ifelse(ENSEMBLES == 2, 1, 0))

# A row is voted 1 when more than (k - 1) / 2 of the k models vote 1, i.e. at
# least two of the three models here.
MAJORITY_VOTE <- ifelse(rowSums(ENSEMBLES) > (ncol(ENSEMBLES) - 1) / 2, 1, 0)

# Fix the levels so the vote is always a two-level factor, even when every
# row receives the same vote.
ENSEMBLES$MAJORITY_VOTE <- factor(MAJORITY_VOTE, levels = c(0, 1))

CM_ENSEMBLES <- confusionMatrix(
  data = ENSEMBLES$MAJORITY_VOTE,
  reference = dftest$churn,
  positive = "1"
)

CM_Function(CM_ENSEMBLES)

# ROC object of the majority vote against the observed churn.
roc_score <- roc(
  factor(dftest$churn, ordered = TRUE),
  factor(ENSEMBLES$MAJORITY_VOTE, ordered = TRUE)
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# Area under the ROC curve of the majority vote, rounded to four decimals for
# the plot title. The result is stored under the name `auc`; R still resolves
# `auc(...)` to the function because non-function objects are skipped when a
# name is called.
auc <- round(
  auc(
    factor(dftest$churn, ordered = TRUE),
    factor(ENSEMBLES$MAJORITY_VOTE, ordered = TRUE)
  ),
  4
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# ROC curve of the majority vote, with the rounded AUC shown in the title.
ggroc(roc_score, colour = "#0D8387", size = 1) +
  ggtitle(paste0("ROC Curve (AUC = ", auc, ")")) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(x = "Specificity", y = "Sensitivity")

ENSEMBLES - MAJORITY VOTING ON ORIGINAL

KNN CTREE BAGGING

# Majority vote over the class predictions of the KNN, CTREE and bagging
# models on the original test set.
# cbind() replaces factors by their internal integer codes, so code 2 is
# recoded as a vote of 1 and everything else as 0.
# NOTE(review): assumes every PRED_*_ORIGINAL is a factor whose second level is
# the churn class "1" -- confirm where the predictions are created.
ENSEMBLES <- cbind(
  PRED_KNN_ORIGINAL,
  PRED_CTREE_ORIGINAL,
  PRED_BAGGING_ORIGINAL
)
ENSEMBLES <- as.data.frame(ifelse(ENSEMBLES == 2, 1, 0))

# A row is voted 1 when more than (k - 1) / 2 of the k models vote 1, i.e. at
# least two of the three models here.
MAJORITY_VOTE <- ifelse(rowSums(ENSEMBLES) > (ncol(ENSEMBLES) - 1) / 2, 1, 0)

# Fix the levels so the vote is always a two-level factor, even when every
# row receives the same vote.
ENSEMBLES$MAJORITY_VOTE <- factor(MAJORITY_VOTE, levels = c(0, 1))

CM_ENSEMBLES <- confusionMatrix(
  data = ENSEMBLES$MAJORITY_VOTE,
  reference = df_original_test$churn,
  positive = "1"
)

CM_Function(CM_ENSEMBLES)

# ROC object of the majority vote against the observed churn.
roc_score <- roc(
  factor(df_original_test$churn, ordered = TRUE),
  factor(ENSEMBLES$MAJORITY_VOTE, ordered = TRUE)
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# Area under the ROC curve of the majority vote on the original test set,
# rounded to four decimals for the plot title. The result is stored under the
# name `auc`; R still resolves `auc(...)` to the function because non-function
# objects are skipped when a name is called.
auc <- round(
  auc(
    factor(df_original_test$churn, ordered = TRUE),
    factor(ENSEMBLES$MAJORITY_VOTE, ordered = TRUE)
  ),
  4
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# ROC curve of the majority vote, with the rounded AUC shown in the title.
ggroc(roc_score, colour = "#0D8387", size = 1) +
  ggtitle(paste0("ROC Curve (AUC = ", auc, ")")) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(x = "Specificity", y = "Sensitivity")

PRESENTATION ASSETS

# Start the presentation section from an empty workspace.
rm(list = ls())

# READ DATA
# Load the churn spreadsheet and recode its yes/no columns as factors.
df <- readxl::read_xls("Cchurn.xls")
df$international_plan <- factor(
  df$international_plan,
  levels = c("no", "yes"),
  labels = c("0", "1")
)
df$voice_mail_plan <- factor(
  df$voice_mail_plan,
  levels = c("no", "yes"),
  labels = c("0", "1")
)
df$churn <- factor(
  df$churn,
  levels = c("no", "yes"),
  labels = c("No", "Yes")
)

# DATA ENGINEERING
# Charge per minute for each period; a period with zero minutes gets 0 rather
# than the result of dividing by zero.
df$total_day_charge_per_minute <- with(
  df,
  ifelse(total_day_minutes == 0, 0, total_day_charge / total_day_minutes)
)
df$total_eve_charge_per_minute <- with(
  df,
  ifelse(total_eve_minutes == 0, 0, total_eve_charge / total_eve_minutes)
)
df$total_night_charge_per_minute <- with(
  df,
  ifelse(total_night_minutes == 0, 0, total_night_charge / total_night_minutes)
)
df$total_intl_charge_per_minute <- with(
  df,
  ifelse(total_intl_minutes == 0, 0, total_intl_charge / total_intl_minutes)
)

# Drop the raw charge and minutes columns now that the ratios exist.
df <- df[, !(names(df) %in% c(
  "total_day_charge", "total_day_minutes",
  "total_eve_charge", "total_eve_minutes",
  "total_night_charge", "total_night_minutes",
  "total_intl_charge", "total_intl_minutes"
))]

# Presentation-friendly column labels, assigned by position.
# NOTE(review): "Total Internation Calls" looks like a typo for
# "International" -- left unchanged in case the label is referenced elsewhere.
names(df) <- c(
  "Account Length",
  "International Plan",
  "Voice Mail Plan",
  "Voice Mail Messages",
  "Total Days Calls",
  "Total Evening Calls",
  "Total Night Calls",
  "Total Internation Calls",
  "Total Customer Service Call",
  "Customer Churn",
  "Total Day Charge/Minute",
  "Total Evening Charge/Minute",
  "Total Night Charge/Minute",
  "Total International Charge/Minute"
)

PAIRS PLOT

# Plotting packages used by the pairs plots
library(ggplot2)
library(GGally)
library(ggthemes)

# Two-colour palette shared by the colour and fill scales
colors <- c("#0D8387", "#870D27")

# Pairs plot of columns 1 to 4, coloured by churn status
PAIRS1 <- ggpairs(
  df,
  columns = 1:4,
  mapping = aes(col = `Customer Churn`, alpha = 0.9)
) +
  scale_color_manual(values = colors) +
  scale_fill_manual(values = colors) +
  labs(
    title = "Customer Telecommunication Data",
    subtitle = "Customer Churn = Yes  is red",
    caption = "From Variable 1 to 4"
  ) +
  theme(plot.title = element_text(face = "bold"))

PAIRS1
FALSE `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
FALSE `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
FALSE `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
FALSE `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Pairs plot of columns 5 to 9, coloured by churn status
PAIRS2 <- ggpairs(
  df,
  columns = 5:9,
  mapping = aes(col = `Customer Churn`, alpha = 0.9)
) +
  scale_color_manual(values = colors) +
  scale_fill_manual(values = colors) +
  labs(
    title = "Customer Telecommunication Data",
    subtitle = "Customer Churn = Yes  is red",
    caption = "From Variable 5 to 9"
  ) +
  theme(plot.title = element_text(face = "bold"))

PAIRS2

# Single-column plot of the churn variable itself (column 10), annotated with
# hard-coded class shares. The bold title is set again after theme_minimal().
PAIRS3 <- ggpairs(
  df,
  columns = 10L,
  mapping = aes(col = `Customer Churn`, alpha = 0.9)
) +
  scale_color_manual(values = colors) +
  scale_fill_manual(values = colors) +
  labs(
    title = "Customer Telecommunication Data",
    subtitle = "",
    y = "Count"
  ) +
  theme(plot.title = element_text(face = "bold")) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  annotate(
    "text",
    x = 2, y = 900,
    label = "14.14%", colour = "#870D27", size = 8
  ) +
  annotate(
    "text",
    x = 1, y = 4500,
    label = "85.86%", colour = "#0D8387", size = 8
  ) +
  theme(axis.text.x = element_text(size = 16))

PAIRS3

# Pairs plot of columns 11 to 14, coloured by churn status
PAIRS4 <- ggpairs(
  df,
  columns = 11:14,
  mapping = aes(col = `Customer Churn`, alpha = 0.9)
) +
  scale_color_manual(values = colors) +
  scale_fill_manual(values = colors) +
  labs(
    title = "Customer Telecommunication Data",
    subtitle = "Customer Churn = Yes  is red",
    caption = "From Variable 11 to 14"
  ) +
  theme(plot.title = element_text(face = "bold"))

PAIRS4

# Recode churn from No/Yes to numeric 0/1: the factor's integer codes are 1/2,
# so subtracting 1 gives 0 for "No" and 1 for "Yes".
df$`Customer Churn` <- as.integer(
  factor(df$`Customer Churn`, levels = c("No", "Yes"), labels = c(0, 1))
) - 1

# Share of customers who churned, and its complement.
Proportions_Churn <- sum(df$`Customer Churn` == 1) / nrow(df)
Proportions_No_Churn <- 1 - Proportions_Churn

Proportion of customers who churned: 14.14%, versus 85.86% who didn’t churn.